********************************************************************************
* 04_merge_event_data.do
* Merge ETS Treatment Data with IPEDS Data to Create Analysis Datasets
*
* Inputs:
*   - data/cleaned/ets_treatment_data.xlsx
*   - data/cleaned/ipeds_data_cleaned.xlsx
*
* Outputs:
*   - data/cleaned/enrollment_event_data.xlsx
*   - data/cleaned/graduation_event_data.xlsx
*
* Notes:
*   - ND and TN are KEPT in the output; downstream scripts handle exclusion.
*   - Creates log outcome variables and event study interaction terms
*   - Adds selectivity classification based on 2010 SAT/ACT scores
*   - Applies CT post-2017 (drop) and SC post-2017 (set to -0.6576) adjustments
********************************************************************************

clear all
set more off

display "========================================"
display "04: Merging ETS Treatment and IPEDS Data"
display "========================================"

* Ensure carryforward is installed (used for selectivity classification)
capture which carryforward
if _rc {
    display "  Installing carryforward..."
    ssc install carryforward, replace
}

capture mkdir "data/cleaned"

* ══════════════════════════════════════════════════════════════════════════════
* CONTROL VARIABLE GLOBALS (for reference)
* ══════════════════════════════════════════════════════════════════════════════

global state_demo  "real_income unemployment_rate"
global state_kraft "passevals implementevals eliminate_tenure increase_probationary_period weaken_bargaining eliminate_union_dues won_race_top common_core edtpa"
global uni_controls "test_optional satpct2 actpct2 satvr25_2 satvr75_2 satmt25_2 satmt75_2 actcm25_2 actcm75_2 pell_percent2 pell_amount2 loan_percent2 loan_average2 enrollment_total2"

* ══════════════════════════════════════════════════════════════════════════════
* 1. LOAD AND PREPARE IPEDS DATA
* ══════════════════════════════════════════════════════════════════════════════

display ""
display "Loading IPEDS data..."

import excel "data/cleaned/ipeds_data_cleaned.xlsx", firstrow clear

* Keep only 2008-2020
keep if year >= 2008 & year <= 2020

* Rename state_abbr to State (matching ETS data and downstream expectations)
rename state_abbr State

* Drop territories and non-states
drop if inlist(State, "AS", "FN", "MH", "MP", "PR", "VI", "GU")

* Keep only sample states (22 in-sample + ND and TN which get dropped later)
keep if inlist(State, "AK", "AR", "CT", "DC", "DE", "HI", "LA", "ME") | ///
       inlist(State, "MD", "MS", "NC", "NE", "NV", "NH", "NJ", "OR") | ///
       inlist(State, "PA", "SC", "VA", "VT", "WV", "WI", "ND", "TN")

display "  IPEDS after state filter: " _N " observations"

* ──────────────────────────────────────────────────────────────────────────────
* Rename IPEDS variables to match downstream column names
* ──────────────────────────────────────────────────────────────────────────────

* Completions: use ba + masters teacher preparation completions
rename ba_masters_teacher_preparation_completions_total   ctotalt
rename ba_masters_teacher_preparation_completions_male    ctotalm
rename ba_masters_teacher_preparation_completions_female  ctotalw
rename ba_masters_teacher_preparation_completions_white   cwhitt
rename ba_masters_teacher_preparation_completions_black   cbkaat
rename ba_masters_teacher_preparation_completions_hispanic chispt

* Additional completion race/ethnicity vars (for graduation data)
* These may not exist in IPEDS R download — generate as 0 if missing
capture rename ba_masters_educationcip2_total_completions cip2_total
foreach var in c2mort cunknt casiat cbkaam cbkaaw chispm chispw cwhitm cwhitw {
    capture confirm variable `var'
    if _rc {
        gen `var' = 0
    }
}

* Enrollment variables
rename all_total_ed_enrollment eftotlt
capture rename all_male_ed_enrollment eftotlm
capture rename all_female_ed_enrollment eftotlw
capture rename all_white_ed_enrollment efwhitt
capture rename all_black_ed_enrollment efbkaat_enrl
capture rename all_hispanic_ed_enrollment efhispt_enrl

* Generate additional enrollment race variables if missing
foreach var in efbkaam efbkaaw efhispm efhispw efwhitm efwhitw efaiant efasiat ef2mort efunknt {
    capture confirm variable `var'
    if _rc {
        gen `var' = 0
    }
}

* Rename enrollment race vars to expected names (careful: efbkaat is for enrollment)
* We have cbkaat for completions and efbkaat_enrl for enrollment
rename efbkaat_enrl efbkaat
rename efhispt_enrl efhispt

* Directory / institutional variables
rename inst_name name
capture rename inst_alias city_alias
capture gen city = ""
rename fips statefips
capture rename region carnegie
capture gen ccbasic = .

* Rename admissions variables to match expected names
capture rename sat_crit_read_25_pctl satvr25
capture rename sat_math_25_pctl satmt25
capture rename act_composite_25_pctl actcm25
capture rename sat_crit_read_75_pctl satvr75
capture rename sat_math_75_pctl satmt75
capture rename act_composite_75_pctl actcm75
capture rename sat_crit_read_50_pctl satvr50
capture rename sat_math_50_pctl satmt50
capture rename act_composite_50_pctl actcm50

* Rename financial aid variables
capture rename pell_percent pell_pct_raw
capture rename pell_average_amount pell_amt_raw

* Rename enrollment-related admission vars
capture rename sat_percent_submitting satpct_raw
capture rename act_percent_submitting actpct_raw
capture rename enrollment_fall_fulltime_firsttime_undergrad enrollment_total_raw

* Generate test_optional indicator
capture confirm variable open_admissions_policy
if !_rc {
    gen test_optional = (open_admissions_policy == 1)
}
else {
    gen test_optional = 0
}

* Population / demographic placeholders (not available from IPEDS)
foreach var in cohort_5_9 cohort_10_14 cohort_15_17 ///
               population pop_5_17 percentage_change_08_18 pop_percentage_change ///
               state_name statename {
    capture confirm variable `var'
    if _rc {
        if "`var'" == "state_name" | "`var'" == "statename" {
            gen `var' = ""
        }
        else {
            gen `var' = .
        }
    }
}

* ──────────────────────────────────────────────────────────────────────────────
* Merge state-level economic and policy controls from state_treatment.dta
* ──────────────────────────────────────────────────────────────────────────────

* Save current data, load state_treatment, merge back
tempfile ipeds_pre_merge
save `ipeds_pre_merge'

capture confirm file "data/raw/policy/state_treatment.dta"
if !_rc {
    display "  Merging state-level controls from state_treatment.dta..."
    use "data/raw/policy/state_treatment.dta", clear
    keep State year real_income unemployment_rate ///
         passevals implementevals eliminate_tenure ///
         increase_probationary_period weaken_bargaining ///
         eliminate_union_dues won_race_top common_core edtpa
    duplicates drop
    tempfile state_controls
    save `state_controls'

    use `ipeds_pre_merge', clear
    merge m:1 State year using `state_controls', nogenerate keep(master match)
    display "  State controls merged."
}
else {
    display "  WARNING: state_treatment.dta not found. Controls will be missing."
    use `ipeds_pre_merge', clear
    foreach var in real_income unemployment_rate ///
                   passevals implementevals eliminate_tenure ///
                   increase_probationary_period weaken_bargaining ///
                   eliminate_union_dues won_race_top common_core edtpa {
        gen `var' = .
    }
}

* ──────────────────────────────────────────────────────────────────────────────
* Create time-varying control variables with _2 suffix
* (These are lagged/smoothed versions used in regressions)
* ──────────────────────────────────────────────────────────────────────────────

* SAT/ACT percentages
capture confirm variable satpct_raw
if !_rc {
    gen satpct2 = satpct_raw
}
else {
    gen satpct2 = .
}

capture confirm variable actpct_raw
if !_rc {
    gen actpct2 = actpct_raw
}
else {
    gen actpct2 = .
}

* SAT/ACT score percentiles with _2 suffix
gen satvr25_2 = satvr25
capture gen satvr75_2 = satvr75
gen satmt25_2 = satmt25
capture gen satmt75_2 = satmt75
gen actcm25_2 = actcm25
capture gen actcm75_2 = actcm75

* Financial aid with _2 suffix
capture confirm variable pell_pct_raw
if !_rc {
    gen pell_percent2 = pell_pct_raw
}
else {
    gen pell_percent2 = .
}

capture confirm variable pell_amt_raw
if !_rc {
    gen pell_amount2 = pell_amt_raw
}
else {
    gen pell_amount2 = .
}

* Loan variables — generate as missing if not in IPEDS download
gen loan_percent2 = .
gen loan_average2 = .

* Total enrollment
capture confirm variable enrollment_total_raw
if !_rc {
    gen enrollment_total2 = enrollment_total_raw
}
else {
    gen enrollment_total2 = .
}

display "  IPEDS variables prepared: " _N " observations"

* Save prepared IPEDS data
tempfile ipeds_prepared
save `ipeds_prepared'

* ══════════════════════════════════════════════════════════════════════════════
* 2. LOAD ETS TREATMENT DATA
* ══════════════════════════════════════════════════════════════════════════════

display ""
display "Loading ETS treatment data..."

import excel "data/cleaned/ets_treatment_data.xlsx", firstrow clear

display "  ETS treatment data: " _N " observations"

* Keep variables needed for merge
* (State, year, test_index, test_index_composite, continuous_treat,
*  time_till, treatment_year, lead/lag variables, year_XXXX interactions)

tempfile ets_data
save `ets_data'

* ══════════════════════════════════════════════════════════════════════════════
* 3. CREATE ENROLLMENT EVENT DATA
* ══════════════════════════════════════════════════════════════════════════════

display ""
display "========================================"
display "Creating Enrollment Event Data"
display "========================================"

use `ipeds_prepared', clear

* ──────────────────────────────────────────────────────────────────────────────
* Merge with ETS treatment data (many:1 on State year)
* ──────────────────────────────────────────────────────────────────────────────

merge m:1 State year using `ets_data'
drop _merge

sort unitid year

* ──────────────────────────────────────────────────────────────────────────────
* Drop universities with zero total completions across all years
* ──────────────────────────────────────────────────────────────────────────────

egen university_graduates = sum(ctotalt), by(unitid)
drop if university_graduates == 0
drop university_graduates

* Drop universities with zero completions in even years only
egen university_graduates = sum(ctotalt) if ///
    year == 2008 | year == 2010 | year == 2012 | ///
    year == 2014 | year == 2016 | year == 2018 | year == 2020, by(unitid)
drop if university_graduates == 0
drop university_graduates

* Drop universities with zero completions in interior years
egen university_graduates = sum(ctotalt) if year != 2008 & year != 2020, by(unitid)
drop if university_graduates == 0
drop university_graduates

* ──────────────────────────────────────────────────────────────────────────────
* Keep only even years (biennial enrollment panel)
* ──────────────────────────────────────────────────────────────────────────────

drop if year == 2009 | year == 2011 | year == 2013 | ///
        year == 2015 | year == 2017 | year == 2019 | year == 2020

* ──────────────────────────────────────────────────────────────────────────────
* Drop universities with zero enrollment
* ──────────────────────────────────────────────────────────────────────────────

egen university_enrollment = sum(eftotlt), by(unitid)
drop if university_enrollment == 0
drop university_enrollment

* Replace missing enrollment race variables
replace efbkaat = 0 if eftotlt == 0
replace efwhitt = 0 if eftotlt == 0
replace efhispt = 0 if eftotlt == 0

* Drop brute-force exclusion unitids (matches DROP_UNITIDS in R version)
drop if inlist(unitid, 221856, 186371, 233602, 233912, 216524) | ///
        inlist(unitid, 128902, 211440, 128780, 181428, 413723)

* ──────────────────────────────────────────────────────────────────────────────
* Create log outcome variables
* ──────────────────────────────────────────────────────────────────────────────

gen l_eftotlt = log(eftotlt + 1)
gen l_efbkaat = log(efbkaat + 1)
gen l_efwhitt = log(efwhitt + 1)
gen l_efhispt = log(efhispt + 1)
gen non_white = eftotlt - efwhitt
gen l_nonwhite = log(non_white + 1)
gen black_hispanic = efbkaat + efhispt
gen l_blackhipanic = log(black_hispanic + 1)
gen l_efbkaat2 = log(efbkaat)

* ──────────────────────────────────────────────────────────────────────────────
* Create event study interaction variables (biennial)
* year_XXXX = DeltaTDI * I(year == XXXX)
* ──────────────────────────────────────────────────────────────────────────────

* Drop any year_XXXX vars inherited from ETS merge (re-create for enrollment)
foreach y in 2008 2010 2012 2014 2016 2018 {
    capture drop year_`y'
    gen year_`y' = 0
}

replace year_2008 = continuous_treat if time_till == -5
replace year_2010 = continuous_treat if time_till == -3
replace year_2012 = continuous_treat if time_till == -1
replace year_2014 = continuous_treat if time_till == 1
replace year_2016 = continuous_treat if time_till == 3
replace year_2018 = continuous_treat if time_till == 5

* Reference year: 2012
replace year_2012 = 0

* PA and SC 2018 indicators
gen pa2018 = (State == "PA" & year == 2018)
gen sc2018 = (State == "SC" & year == 2018)

* ──────────────────────────────────────────────────────────────────────────────
* Export enrollment event data
* ──────────────────────────────────────────────────────────────────────────────

display ""
display "  Enrollment event data: " _N " observations"
quietly tab unitid
display "  Universities: " r(r)
quietly tab State
display "  States: " r(r)

export excel using "data/cleaned/enrollment_event_data.xlsx", firstrow(variables) replace
display "  Saved: data/cleaned/enrollment_event_data.xlsx"

* ══════════════════════════════════════════════════════════════════════════════
* 4. CREATE GRADUATION EVENT DATA
* ══════════════════════════════════════════════════════════════════════════════

display ""
display "========================================"
display "Creating Graduation Event Data"
display "========================================"

use `ipeds_prepared', clear

* ──────────────────────────────────────────────────────────────────────────────
* Merge with ETS treatment data (many:1 on State year)
* ──────────────────────────────────────────────────────────────────────────────

merge m:1 State year using `ets_data'
drop _merge

sort unitid year

* ──────────────────────────────────────────────────────────────────────────────
* Drop universities with zero total completions across all years
* ──────────────────────────────────────────────────────────────────────────────

egen university_graduates = sum(ctotalt), by(unitid)
drop if university_graduates == 0
drop university_graduates

* Drop universities with zero completions in even years
egen university_graduates = sum(ctotalt) if ///
    year == 2008 | year == 2010 | year == 2012 | ///
    year == 2014 | year == 2016 | year == 2018 | year == 2020, by(unitid)
drop if university_graduates == 0
drop university_graduates

* Drop universities with zero completions in interior years
egen university_graduates = sum(ctotalt) if year != 2008 & year != 2020, by(unitid)
drop if university_graduates == 0
drop university_graduates

* ──────────────────────────────────────────────────────────────────────────────
* Merge with enrollment data to verify consistency
* (Enrollment data serves as a cross-check; keep all graduation rows)
* ──────────────────────────────────────────────────────────────────────────────

* Universities with zero enrollment across all years get dropped
egen university_enrollment = sum(eftotlt), by(unitid)
drop if university_enrollment == 0
drop university_enrollment

* For graduation data with missing enrollment in early/late years,
* set completions to 0 for even years where enrollment data was missing
* (mirrors original logic for merge == 1 cases)

* Drop brute-force exclusion unitids (matches DROP_UNITIDS in R version)
drop if inlist(unitid, 221856, 186371, 233602, 233912, 216524) | ///
        inlist(unitid, 128902, 211440, 128780, 181428, 413723)

* ──────────────────────────────────────────────────────────────────────────────
* Create log outcome variables
* ──────────────────────────────────────────────────────────────────────────────

gen l_ctotalt = log(ctotalt + 1)
gen l_cbkaat = log(cbkaat + 1)
gen l_chispt = log(chispt + 1)
gen l_cwhitt = log(cwhitt + 1)
gen l_c2mort = log(c2mort + 1)
gen l_cunknt = log(cunknt + 1)

gen cnonwhite = ctotalt - cwhitt
gen l_cnonwhite = log(cnonwhite + 1)

gen l_male = log(ctotalm + 1)
gen l_female = log(ctotalw + 1)

* ──────────────────────────────────────────────────────────────────────────────
* Create selectivity classification (based on 2010 SAT/ACT scores)
* ──────────────────────────────────────────────────────────────────────────────

* Carry forward 2008 SAT/ACT values within each university
sort unitid year
gen satv_2008 = satvr25 if year == 2008
bysort unitid (year): carryforward satv_2008, replace

gen satm_2008 = satmt25 if year == 2008
bysort unitid (year): carryforward satm_2008, replace

gen act_2008 = actcm25 if year == 2008
bysort unitid (year): carryforward act_2008, replace

replace satv_2008 = 0 if satv_2008 == .
replace act_2008 = 0 if act_2008 == .

* 2010 SAT/ACT for selectivity cutoff
gen satv_2010 = satvr25 if year == 2010
sort unitid year
bysort unitid (year): carryforward satv_2010, replace
gsort unitid -year
bysort unitid: carryforward satv_2010, replace
replace satv_2010 = 0 if satv_2010 == .

gen act_2010 = actcm25 if year == 2010
sort unitid year
bysort unitid (year): carryforward act_2010, replace
gsort unitid -year
bysort unitid: carryforward act_2010, replace

* Selectivity: above SAT verbal 430 or ACT 19 in 2010
gen selective = .
replace selective = 1 if satv_2010 > 430
replace selective = 1 if act_2010 >= 19 & satv_2010 == 0 & act_2010 != .
replace selective = 0 if selective == .

* ──────────────────────────────────────────────────────────────────────────────
* Drop 2008 (graduation data starts at 2009)
* ──────────────────────────────────────────────────────────────────────────────

drop if year == 2008

* ──────────────────────────────────────────────────────────────────────────────
* Create event study interaction variables (annual)
* Rename lead/lag variables from ETS merge to year_XXXX format
* ──────────────────────────────────────────────────────────────────────────────

* Drop any year_XXXX vars inherited from ETS merge, re-create from lead/lag
foreach y in 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 {
    capture drop year_`y'
}

rename lead5  year_2008
rename lead4  year_2009
rename lead3  year_2010
rename lead2  year_2011
rename lead1  year_2012
rename lag0   year_2013
rename lag1   year_2014
rename lag2   year_2015
rename lag3   year_2016
rename lag4   year_2017
rename lag5   year_2018
rename lag6   year_2019
rename lag7   year_2020

* Reference year: 2012
replace year_2012 = 0

* ──────────────────────────────────────────────────────────────────────────────
* Export graduation event data
* ──────────────────────────────────────────────────────────────────────────────

display ""
display "  Graduation event data: " _N " observations"
quietly tab unitid
display "  Universities: " r(r)
quietly tab State
display "  States: " r(r)

sort unitid year
export excel using "data/cleaned/graduation_event_data.xlsx", firstrow(variables) replace
display "  Saved: data/cleaned/graduation_event_data.xlsx"

* ──────────────────────────────────────────────────────────────────────────────
* Summary
* ──────────────────────────────────────────────────────────────────────────────

display ""
display "========================================"
display "Merge Summary"
display "========================================"
display "Created:"
display "  - data/cleaned/enrollment_event_data.xlsx"
display "  - data/cleaned/graduation_event_data.xlsx"
display ""
display "04_merge_event_data.do complete."
